In [1]:
import os 
import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline 
In [2]:
# !pip install pandas_profiling;
# !pip install emot;
# !pip install ipywidgets;
# !pip install -U kaleido

Загрузка данных¶

In [3]:
df_path = os.path.join('..', 'data', 'train.csv')
df = pd.read_csv(df_path)

print(f"SHAPE: {df.shape}")
df.head(3)
SHAPE: (7613, 5)
Out[3]:
id keyword location text target
0 1 NaN NaN Our Deeds are the Reason of this #earthquake M... 1
1 4 NaN NaN Forest fire near La Ronge Sask. Canada 1
2 5 NaN NaN All residents asked to 'shelter in place' are ... 1

Анализ данных¶

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
In [5]:
# Поиск дубликатов
print(f"Dplicates: {df.duplicated().any()}")
Dplicates: False
In [6]:
print(df['location'].unique())
[nan 'Birmingham' 'Est. September 2012 - Bristol' ... 'Vancouver, Canada'
 'London ' 'Lincoln']

Поиск и удаление аномалий¶

In [7]:
# Поиск аномалий (Сообщений с одинаковым текстом но разными метками)
df_mislabeled = df.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']
df_mislabeled
Out[7]:
text
like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit                  2
Hellfire! We don‰Ûªt even want to think about it or mention it so let‰Ûªs not do anything that leads to it #islam!                             2
The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'                             2
In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!                                         2
To fight bioterrorism sir.                                                                                                                     2
Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE    2
#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption                          2
#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect     2
He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam                    2
RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG                                                 2
Hellfire is surrounded by desires so be careful and don‰Ûªt let your desires control you! #Afterlife                                           2
CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring                                                       2
Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...                                                           2
wowo--=== 12000 Nigerian refugees repatriated from Cameroon                                                                                    2
.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4         2
Caution: breathing may be hazardous to your health.                                                                                            2
I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????                                                             2
that horrible sinking feeling when you‰Ûªve been at home on your phone for a while and you realise its been on 3G this whole time              2
Name: target, dtype: int64
In [8]:
# Удаление выбросов
df = df[~df['text'].isin(df_mislabeled.index.to_list())]
df.reset_index(inplace=True)
print(f"Shape: {df.shape}")
Shape: (7558, 6)

TMP¶

In [9]:
fig = px.pie(df, names='target', title='Соотношение классов таргета')
fig.show()
In [10]:
fig = px.histogram(data_frame=df.sort_values('target'), x="keyword", color='target')
fig.update_xaxes(categoryorder='min ascending')
# fig.update_xaxes(tickangle = -90)|
fig.show()
In [11]:
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(
    ax = ax, 
    y = df['location'], 
    order = df['location'].value_counts().iloc[:15].index,
)
ax.set_title('Top 25 location from the tweets')
plt.grid()
plt.show()

Предобработка текста¶

In [12]:
# Конвертация эмодзи и текстовых смайликов
import re
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from pandas_profiling import ProfileReport
In [13]:
# Конвертация эмодзи в слова
def convert_emojis(text):
    for emot in UNICODE_EMOJI:
        text = text.replace(emot, "_".join(UNICODE_EMOJI[emot].replace(",","").replace(":","").split()))
#         text = text.replace(emot, "")
        return text
    
# Converting emoticons to words    
def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, EMOTICONS_EMO[emot].replace(" ","_"))
#         text = text.replace(emot, "")
        return text
In [14]:
import string

def clean_text(text: str) -> str:
    text = str(text).lower()
    clean_text = text.replace('{html}',"") # removing html files
#     clean_text = re.sub('<.*?>', '', clean_text) # Removing punctuation
    clean_text = re.sub(fr'[{string.punctuation}]', '', clean_text) # Removing punctuation
    clean_text = re.sub(r'http\S+', '',clean_text) # Removing links
    clean_text = re.sub('[0-9]+', '', clean_text)  # Removing numbers
    return clean_text
In [15]:
import nltk
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
# from pymystem3 import Mystem


# nltk.download('wordnet')
nltk.download('stopwords')

en_stopwords = set(stopwords.words("english"))

wnl = WordNetLemmatizer()
def lemmatize_text(text: str):
    return " ".join([wnl.lemmatize(w) for w in text.split() if w not in en_stopwords])
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [16]:
df['text'] = df['text'].apply(convert_emoticons)
df['text'] = df['text'].apply(convert_emojis)
df['clean_text'] = df['text'].apply(lemmatize_text)
df['clean_text'] = df['clean_text'].apply(clean_text)

df['keyword'].fillna('UNKNOW', inplace=True)
df['location'].fillna('UNKNOW', inplace=True)

df['text_len'] = df['clean_text'].apply(len)
In [17]:
from collections import Counter

# Removing the frequent words
cnt = Counter()

for text in df["clean_text"].values:
    for word in text.split():
        cnt[word] += 1
        
px.bar(x=[w for w, c in cnt.most_common()[:100]], y=[c for w, c in cnt.most_common()[:100]])
In [18]:
cnt.most_common(15)
Out[18]:
[('i', 1255),
 ('the', 676),
 ('a', 342),
 ('like', 342),
 ('fire', 316),
 ('amp', 300),
 ('im', 293),
 ('get', 252),
 ('new', 224),
 ('via', 220),
 ('in', 218),
 ('one', 200),
 ('news', 197),
 ('people', 190),
 ('video', 170)]
In [19]:
freq = set([w for (w, wc) in cnt.most_common(2)])

# function to remove the frequent words
def freqwords(text):
    return " ".join([word for word in str(text).split() if word not in freq])

# Passing the function freqwords
df["clean_text"] = df["clean_text"].apply(freqwords)
df["clean_text"].head()
Out[19]:
0      our deeds reason earthquake may allah forgive u
1                forest fire near la ronge sask canada
2    all resident asked shelter place notified offi...
3    people receive wildfires evacuation order cali...
4    just got sent photo ruby alaska smoke wildfire...
Name: clean_text, dtype: object
In [20]:
df.head()
Out[20]:
index id keyword location text target clean_text text_len
0 0 1 UNKNOW UNKNOW Our Deeds are the Reason of this #earthquake M... 1 our deeds reason earthquake may allah forgive u 47
1 1 4 UNKNOW UNKNOW Forest fire near La Ronge Sask. Canada 1 forest fire near la ronge sask canada 37
2 2 5 UNKNOW UNKNOW All residents asked to 'shelter in place' are ... 1 all resident asked shelter place notified offi... 93
3 3 6 UNKNOW UNKNOW 13,000 people receive #wildfires evacuation or... 1 people receive wildfires evacuation order cali... 53
4 4 7 UNKNOW UNKNOW Just got sent this photo from Ruby #Alaska as ... 1 just got sent photo ruby alaska smoke wildfire... 60
In [21]:
profile = ProfileReport(df, html={'style':{'full_width':True}})
profile 
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[21]:

Подготовка обучающей выборки¶

In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score, classification_report
In [23]:
# prepared_df = df[['clean_text', 'text_len', 'target']]
prepared_df = df[['keyword', 'location', 'clean_text', 'text_len', 'target']]
X_train, X_test, y_train, y_test = train_test_split(
    prepared_df.drop('target', axis=1),
    prepared_df['target'], 
    test_size=0.3, 
    random_state=42
)
print(f"SHAPES:\nX_train: {X_train.shape}\ty_train: {y_train.shape}\nX_test: {X_test.shape}\ty_test: {y_test.shape}")
SHAPES:
X_train: (5290, 4)	y_train: (5290,)
X_test: (2268, 4)	y_test: (2268,)

Построение ML-моделей¶

In [24]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler, Normalizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import LogisticRegression
from sklearn import svm

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
In [25]:
class ModifiedLabelEncoder(LabelEncoder):
    def __init__(self):
        self.encoders = []
        
    def fit_transform(self, X, *args, **kwargs):
        for i, x in enumerate(X):
            self.encoders.append(LabelEncoder())
            X[i] = self.encoders[i].fit_transform(x)
        return X

    def transform(self, X, *args, **kwargs):
        for i, x in enumerate(X):
            X[i] = self.encoders[i].transform(x)
        return X
In [26]:
def create_pipeline(model):
    # CATEGORICAL
    categorical_features = ['keyword', 'location']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
        ('label_encoder', OneHotEncoder(handle_unknown='ignore')),
#         ('label_encoder', ModifiedLabelEncoder())
    ])

    # NUMERIC
    numeric_features = ['text_len']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')), 
        ('scaler', StandardScaler())
    ])

    # TEXT 
    text_features = ['clean_text']
    text_transformer = Pipeline(steps=[
        ('vectorizer', TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, max_features=20_000)),
        ('svd', TruncatedSVD(n_components=1000, random_state=42))
    ])

    # PIPELINE
    pipeline = Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=[
            ('cat', categorical_transformer, categorical_features),
            ('num', numeric_transformer, numeric_features),
            ('text', text_transformer, 'clean_text'),
        ], remainder='drop', verbose=False)), 
        ('clf', model)
    ])
    
    return pipeline

LogLos¶

In [27]:
%%time
parameters = {
    'clf__penalty':['l1', 'l2', 'elasticnet'], 
    'clf__C':[0.5, 1.0, 1.5, 2.0, 2.5, 3.0], 
    'clf__class_weight': [None, 'balanced'],
    'clf__solver': ['liblinear', 'saga']
}

log_reg = LogisticRegression(random_state=42)
log_reg_pipe = create_pipeline(log_reg)
log_reg_clf = GridSearchCV(log_reg_pipe, parameters, scoring='f1', n_jobs=-1)
log_reg_clf.fit(X_train, y_train.values)
Wall time: 32min 21s
Out[27]:
GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('label_encoder',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['keyword',
                                                                          'location']),
                                                                        ('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['text_len']),
                                                                        ('text',
                                                                         Pi...
                                                                                          TfidfVectorizer(max_features=20000,
                                                                                                          ngram_range=(1,
                                                                                                                       3),
                                                                                                          sublinear_tf=True)),
                                                                                         ('svd',
                                                                                          TruncatedSVD(n_components=1000,
                                                                                                       random_state=42))]),
                                                                         'clean_text')])),
                                       ('clf',
                                        LogisticRegression(random_state=42))]),
             n_jobs=-1,
             param_grid={'clf__C': [0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
                         'clf__class_weight': [None, 'balanced'],
                         'clf__penalty': ['l1', 'l2', 'elasticnet'],
                         'clf__solver': ['liblinear', 'saga']},
             scoring='f1')
In [28]:
log_reg_clf.best_estimator_, log_reg_clf.best_params_, log_reg_clf.best_score_,  
Out[28]:
(Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('cat',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer(strategy='most_frequent')),
                                                                   ('label_encoder',
                                                                    OneHotEncoder(handle_unknown='ignore'))]),
                                                   ['keyword', 'location']),
                                                  ('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer()),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['text_len']),
                                                  ('text',
                                                   Pipeline(steps=[('vectorizer',
                                                                    TfidfVectorizer(max_features=20000,
                                                                                    ngram_range=(1,
                                                                                                 3),
                                                                                    sublinear_tf=True)),
                                                                   ('svd',
                                                                    TruncatedSVD(n_components=1000,
                                                                                 random_state=42))]),
                                                   'clean_text')])),
                 ('clf',
                  LogisticRegression(C=1.5, class_weight='balanced',
                                     penalty='l1', random_state=42,
                                     solver='liblinear'))]),
 {'clf__C': 1.5,
  'clf__class_weight': 'balanced',
  'clf__penalty': 'l1',
  'clf__solver': 'liblinear'},
 0.7598162562575588)
In [29]:
preds = log_reg_clf.predict_proba(X_test)
print(f"roc_auc: {roc_auc_score(y_test, preds[:, 1])}")

preds = log_reg_clf.predict(X_test)
print(f"f1_score: {f1_score(y_test, preds)}")
print(classification_report(y_test, preds))
roc_auc: 0.8630197931860202
f1_score: 0.7621009268795057
              precision    recall  f1-score   support

           0       0.83      0.81      0.82      1309
           1       0.75      0.77      0.76       959

    accuracy                           0.80      2268
   macro avg       0.79      0.79      0.79      2268
weighted avg       0.80      0.80      0.80      2268

CatBoost¶

In [30]:
from catboost import Pool
from catboost import CatBoostClassifier
In [31]:
train_pool = Pool(
    data = X_train, 
    label = y_train, 
    cat_features = ['keyword', 'location'],
    text_features = ['clean_text']
)
test_pool = Pool(
    data = X_test, 
    label = y_test,
    cat_features = ['keyword', 'location'],
    text_features = ['clean_text'])

print('Train dataset shape: {}\n'.format(train_pool.shape))
Train dataset shape: (5290, 4)

In [32]:
def fit_model(train_pool, test_pool, **kwargs):
    model = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.05,
        eval_metric='AUC',
        **kwargs
    )

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=500,
    )

cb_model_clf = fit_model(train_pool, test_pool)
0:	test: 0.7744655	best: 0.7744655 (0)	total: 235ms	remaining: 3m 54s
500:	test: 0.8576487	best: 0.8576487 (500)	total: 53s	remaining: 52.8s
999:	test: 0.8637670	best: 0.8639948 (974)	total: 1m 45s	remaining: 0us

bestTest = 0.8639948348
bestIteration = 974

Shrink model to first 975 iterations.
In [33]:
preds = cb_model_clf.predict_proba(X_test)
print(f"roc_auc: {roc_auc_score(y_test, preds[:, 1])}")

preds = cb_model_clf.predict(X_test)
print(f"f1_score: {f1_score(y_test, preds)}")
print(classification_report(y_test, preds))
roc_auc: 0.8639948348284239
f1_score: 0.7557251908396948
              precision    recall  f1-score   support

           0       0.81      0.86      0.83      1309
           1       0.79      0.72      0.76       959

    accuracy                           0.80      2268
   macro avg       0.80      0.79      0.79      2268
weighted avg       0.80      0.80      0.80      2268

SVM¶

In [29]:
%%time
parameters = { 
    'clf__C':[0.5, 0.7, 1.0, 1.3, 5.0, 10.0], 
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__class_weight': [None, 'balanced']
}

svc = svm.SVC(random_state=42, probability=True)
svm_pipe = create_pipeline(svc)
svm_clf = GridSearchCV(svm_pipe, parameters, scoring='roc_auc', n_jobs=-2)
svm_clf.fit(X_train, y_train.values)
Wall time: 4h 23min 51s
Out[29]:
GridSearchCV(estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='most_frequent')),
                                                                                         ('label_encoder',
                                                                                          OneHotEncoder(handle_unknown='ignore'))]),
                                                                         ['keyword',
                                                                          'location']),
                                                                        ('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['text_len']),
                                                                        ('text',
                                                                         Pi...
                                                                                          TfidfVectorizer(max_features=20000,
                                                                                                          ngram_range=(1,
                                                                                                                       2),
                                                                                                          sublinear_tf=True)),
                                                                                         ('svd',
                                                                                          TruncatedSVD(n_components=1000,
                                                                                                       random_state=42))]),
                                                                         'clean_text')])),
                                       ('clf',
                                        SVC(probability=True,
                                            random_state=42))]),
             n_jobs=-2,
             param_grid={'clf__C': [0.5, 0.7, 1.0, 1.3, 5.0, 10.0],
                         'clf__class_weight': [None, 'balanced'],
                         'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             scoring='roc_auc')
In [30]:
# svm_clf.best_estimator_, svm_clf.best_params_, svm_clf.best_score_,  
In [31]:
preds = svm_clf.predict_proba(X_test)
roc_auc_score(y_test, preds[:, 1])
Out[31]:
0.8612943518482377
In [46]:
preds = model.predict(X_test)
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

           0       0.79      0.90      0.84      1309
           1       0.83      0.68      0.75       959

    accuracy                           0.81      2268
   macro avg       0.81      0.79      0.79      2268
weighted avg       0.81      0.81      0.80      2268

In [ ]: